1. Import libraries

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ✔ readr     2.1.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:dplyr':
## 
##     recode

2. Import dataset

# Load the tab-separated marketing campaign file; the first row holds the
# column names. `TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned, and `<-` is used for assignment.
df <- read.csv("marketing_campaign.csv", sep = "\t", header = TRUE)
# Preview the first rows to confirm the file was split into columns correctly
head(df)
##     ID Year_Birth  Education Marital_Status Income Kidhome Teenhome Dt_Customer
## 1 5524       1957 Graduation         Single  58138       0        0  04-09-2012
## 2 2174       1954 Graduation         Single  46344       1        1  08-03-2014
## 3 4141       1965 Graduation       Together  71613       0        0  21-08-2013
## 4 6182       1984 Graduation       Together  26646       1        0  10-02-2014
## 5 5324       1981        PhD        Married  58293       1        0  19-01-2014
## 6 7446       1967     Master       Together  62513       0        1  09-09-2013
##   Recency MntWines MntFruits MntMeatProducts MntFishProducts MntSweetProducts
## 1      58      635        88             546             172               88
## 2      38       11         1               6               2                1
## 3      26      426        49             127             111               21
## 4      26       11         4              20              10                3
## 5      94      173        43             118              46               27
## 6      16      520        42              98               0               42
##   MntGoldProds NumDealsPurchases NumWebPurchases NumCatalogPurchases
## 1           88                 3               8                  10
## 2            6                 2               1                   1
## 3           42                 1               8                   2
## 4            5                 2               2                   0
## 5           15                 5               5                   3
## 6           14                 2               6                   4
##   NumStorePurchases NumWebVisitsMonth AcceptedCmp3 AcceptedCmp4 AcceptedCmp5
## 1                 4                 7            0            0            0
## 2                 2                 5            0            0            0
## 3                10                 4            0            0            0
## 4                 4                 6            0            0            0
## 5                 6                 5            0            0            0
## 6                10                 6            0            0            0
##   AcceptedCmp1 AcceptedCmp2 Complain Z_CostContact Z_Revenue Response
## 1            0            0        0             3        11        1
## 2            0            0        0             3        11        0
## 3            0            0        0             3        11        0
## 4            0            0        0             3        11        0
## 5            0            0        0             3        11        0
## 6            0            0        0             3        11        0

3. Data Cleaning

# Clean the raw data and derive the analysis features in one pipeline:
#   1. replace a missing Income with the mean Income of the same Education
#      group,
#   2. append the engineered columns,
#   3. drop the raw columns that the engineered ones replace.
df <- df %>%
  group_by(Education) %>%
  mutate(Income = ifelse(is.na(Income), mean(Income, na.rm = TRUE), Income)) %>%
  ungroup() %>%
  mutate(
    # Age relative to the reference year 2024
    Age = 2024 - Year_Birth,
    # Kids and teenagers at home combined
    Children = Kidhome + Teenhome,
    # 1 when the customer has at least one child, otherwise 0
    Parental_Status = as.integer(Children != 0),
    # Amount spent over all six product categories
    Total_Spent = MntWines + MntFruits + MntMeatProducts +
      MntFishProducts + MntSweetProducts + MntGoldProds,
    # Number of the five campaigns (Cmp1..Cmp5) the customer accepted
    Total_Offer = AcceptedCmp1 + AcceptedCmp2 + AcceptedCmp3 +
      AcceptedCmp4 + AcceptedCmp5,
    # Purchases summed over web, catalog, store and deal purchases
    Num_Total_Purchases = NumWebPurchases + NumCatalogPurchases +
      NumStorePurchases + NumDealsPurchases
  ) %>%
  dplyr::select(-Year_Birth, -Kidhome, -Teenhome)

# Preview the cleaned data
head(df)
## # A tibble: 6 × 32
##      ID Education  Marital_Status Income Dt_Customer Recency MntWines MntFruits
##   <int> <chr>      <chr>           <dbl> <chr>         <int>    <int>     <int>
## 1  5524 Graduation Single          58138 04-09-2012       58      635        88
## 2  2174 Graduation Single          46344 08-03-2014       38       11         1
## 3  4141 Graduation Together        71613 21-08-2013       26      426        49
## 4  6182 Graduation Together        26646 10-02-2014       26       11         4
## 5  5324 PhD        Married         58293 19-01-2014       94      173        43
## 6  7446 Master     Together        62513 09-09-2013       16      520        42
## # ℹ 24 more variables: MntMeatProducts <int>, MntFishProducts <int>,
## #   MntSweetProducts <int>, MntGoldProds <int>, NumDealsPurchases <int>,
## #   NumWebPurchases <int>, NumCatalogPurchases <int>, NumStorePurchases <int>,
## #   NumWebVisitsMonth <int>, AcceptedCmp3 <int>, AcceptedCmp4 <int>,
## #   AcceptedCmp5 <int>, AcceptedCmp1 <int>, AcceptedCmp2 <int>, Complain <int>,
## #   Z_CostContact <int>, Z_Revenue <int>, Response <int>, Age <dbl>,
## #   Children <int>, Parental_Status <int>, Total_Spent <int>, …

4. Descriptive Statistics

Descriptive Statistics:

# Per-column summary of the whole data frame: quartiles, mean and range for
# numeric columns; length/class/mode for character columns.
print("Basic Statistic for numurical variables")
## [1] "Basic Statistic for numurical variables"
df %>% summary()
##        ID         Education         Marital_Status         Income      
##  Min.   :    0   Length:2240        Length:2240        Min.   :  1730  
##  1st Qu.: 2828   Class :character   Class :character   1st Qu.: 35539  
##  Median : 5458   Mode  :character   Mode  :character   Median : 51610  
##  Mean   : 5592                                         Mean   : 52254  
##  3rd Qu.: 8428                                         3rd Qu.: 68290  
##  Max.   :11191                                         Max.   :666666  
##  Dt_Customer           Recency         MntWines         MntFruits    
##  Length:2240        Min.   : 0.00   Min.   :   0.00   Min.   :  0.0  
##  Class :character   1st Qu.:24.00   1st Qu.:  23.75   1st Qu.:  1.0  
##  Mode  :character   Median :49.00   Median : 173.50   Median :  8.0  
##                     Mean   :49.11   Mean   : 303.94   Mean   : 26.3  
##                     3rd Qu.:74.00   3rd Qu.: 504.25   3rd Qu.: 33.0  
##                     Max.   :99.00   Max.   :1493.00   Max.   :199.0  
##  MntMeatProducts MntFishProducts  MntSweetProducts  MntGoldProds   
##  Min.   :   0    Min.   :  0.00   Min.   :  0.00   Min.   :  0.00  
##  1st Qu.:  16    1st Qu.:  3.00   1st Qu.:  1.00   1st Qu.:  9.00  
##  Median :  67    Median : 12.00   Median :  8.00   Median : 24.00  
##  Mean   : 167    Mean   : 37.53   Mean   : 27.06   Mean   : 44.02  
##  3rd Qu.: 232    3rd Qu.: 50.00   3rd Qu.: 33.00   3rd Qu.: 56.00  
##  Max.   :1725    Max.   :259.00   Max.   :263.00   Max.   :362.00  
##  NumDealsPurchases NumWebPurchases  NumCatalogPurchases NumStorePurchases
##  Min.   : 0.000    Min.   : 0.000   Min.   : 0.000      Min.   : 0.00    
##  1st Qu.: 1.000    1st Qu.: 2.000   1st Qu.: 0.000      1st Qu.: 3.00    
##  Median : 2.000    Median : 4.000   Median : 2.000      Median : 5.00    
##  Mean   : 2.325    Mean   : 4.085   Mean   : 2.662      Mean   : 5.79    
##  3rd Qu.: 3.000    3rd Qu.: 6.000   3rd Qu.: 4.000      3rd Qu.: 8.00    
##  Max.   :15.000    Max.   :27.000   Max.   :28.000      Max.   :13.00    
##  NumWebVisitsMonth  AcceptedCmp3      AcceptedCmp4      AcceptedCmp5    
##  Min.   : 0.000    Min.   :0.00000   Min.   :0.00000   Min.   :0.00000  
##  1st Qu.: 3.000    1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000  
##  Median : 6.000    Median :0.00000   Median :0.00000   Median :0.00000  
##  Mean   : 5.317    Mean   :0.07277   Mean   :0.07455   Mean   :0.07277  
##  3rd Qu.: 7.000    3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000  
##  Max.   :20.000    Max.   :1.00000   Max.   :1.00000   Max.   :1.00000  
##   AcceptedCmp1      AcceptedCmp2        Complain        Z_CostContact
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.000000   Min.   :3    
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.000000   1st Qu.:3    
##  Median :0.00000   Median :0.00000   Median :0.000000   Median :3    
##  Mean   :0.06429   Mean   :0.01339   Mean   :0.009375   Mean   :3    
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.000000   3rd Qu.:3    
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.000000   Max.   :3    
##    Z_Revenue     Response           Age            Children     
##  Min.   :11   Min.   :0.0000   Min.   : 28.00   Min.   :0.0000  
##  1st Qu.:11   1st Qu.:0.0000   1st Qu.: 47.00   1st Qu.:0.0000  
##  Median :11   Median :0.0000   Median : 54.00   Median :1.0000  
##  Mean   :11   Mean   :0.1491   Mean   : 55.19   Mean   :0.9504  
##  3rd Qu.:11   3rd Qu.:0.0000   3rd Qu.: 65.00   3rd Qu.:1.0000  
##  Max.   :11   Max.   :1.0000   Max.   :131.00   Max.   :3.0000  
##  Parental_Status   Total_Spent       Total_Offer     Num_Total_Purchases
##  Min.   :0.0000   Min.   :   5.00   Min.   :0.0000   Min.   : 0.00      
##  1st Qu.:0.0000   1st Qu.:  68.75   1st Qu.:0.0000   1st Qu.: 8.00      
##  Median :1.0000   Median : 396.00   Median :0.0000   Median :15.00      
##  Mean   :0.7152   Mean   : 605.80   Mean   :0.2978   Mean   :14.86      
##  3rd Qu.:1.0000   3rd Qu.:1045.50   3rd Qu.:0.0000   3rd Qu.:21.00      
##  Max.   :1.0000   Max.   :2525.00   Max.   :4.0000   Max.   :44.00
# categorical variables
print("Basic Statistic for categorical variables")
## [1] "Basic Statistic for categorical variables"

# Build a frequency table for one categorical vector.
#
# Combining a table of counts and a table of proportions directly in
# data.frame() repeats the category column (`Counts.Var1` and
# `Percentages.Var1`); this helper returns each category exactly once.
#
# values: a vector of category labels.
# Returns a data frame with one row per distinct value and the columns
#   Category (label), Count (occurrences) and Percentage (share of all
#   non-missing values, 0-100).
summarise_category <- function(values) {
  counts <- table(values)
  data.frame(
    # as.character() keeps the column when `values` is empty (names are NULL)
    Category = as.character(names(counts)),
    Count = as.integer(counts),
    Percentage = as.numeric(prop.table(counts)) * 100
  )
}

summary_table <- summarise_category(df$Education)
print(summary_table)
##     Category Count Percentage
## 1   2n Cycle   203   9.062500
## 2      Basic    54   2.410714
## 3 Graduation  1127  50.312500
## 4     Master   370  16.517857
## 5        PhD   486  21.696429
summary_table <- summarise_category(df$Marital_Status)
print(summary_table)
##   Category Count  Percentage
## 1   Absurd     2  0.08928571
## 2    Alone     3  0.13392857
## 3 Divorced   232 10.35714286
## 4  Married   864 38.57142857
## 5   Single   480 21.42857143
## 6 Together   580 25.89285714
## 7    Widow    77  3.43750000
## 8     YOLO     2  0.08928571
# Plotting Distribution

# Numeric columns that each get their own histogram
numeric_features <- c(
  "Total_Spent", "Num_Total_Purchases", "Age", "Income", "Recency",
  "MntWines", "MntFruits", "MntMeatProducts", "MntFishProducts",
  "MntSweetProducts", "MntGoldProds"
)

# Draw one histogram per numeric column
for (feature in numeric_features) {
  # Bin width is 1/30 of the observed range of the column
  value_range <- range(df[[feature]])
  bin_width <- (value_range[2] - value_range[1]) / 30

  plt <- ggplot(df, aes(x = .data[[feature]])) +
    geom_histogram(
      binwidth = bin_width,
      fill = "skyblue",
      color = "black",
      alpha = 0.7
    ) +
    ggtitle(paste("Distribution of", feature)) +
    xlab(feature) +
    ylab("Frequency") +
    theme_minimal()

  # Plots built inside a loop are only rendered when printed explicitly
  print(plt)
}

# Categorical columns that each get a count plot
categorical_features <- c("Education", "Marital_Status")

# Draw one bar chart of customer counts per categorical column
for (feature in categorical_features) {
  plot_title <- paste("Count of customers by", feature)

  plt <- ggplot(df, aes(x = .data[[feature]])) +
    geom_bar(fill = "skyblue", color = "black", alpha = 0.7) +
    ggtitle(plot_title) +
    xlab(feature) +
    ylab("Count") +
    theme_minimal()

  # Plots built inside a loop are only rendered when printed explicitly
  print(plt)
}

Customer Segmentation Analysis

# Explore relationship

# Scatter Plot
# Select the numeric features
numeric_features <- c('Total_Spent', 'Num_Total_Purchases', 'Age', 'Income', 'Recency', 'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts', 'MntGoldProds')

# Create scatter plots for each numeric feature against Total_Spent.
# The column is looked up by name through the `.data` pronoun; referring to
# `df[[feature]]` inside aes() is discouraged by ggplot2 and raised one
# warning per plot.
for (feature in numeric_features) {
  scatter_plot <- ggplot(df, aes(x = .data[[feature]], y = Total_Spent)) +
    geom_point() +
    labs(title = paste("Scatter Plot between", feature, "and Total_Spent"),
         x = feature,
         y = "Total_Spent") +
    theme_minimal()

  # Plots built inside a loop are only rendered when printed explicitly
  print(scatter_plot)
}

# Mean Income per Education level, one dodged bar per Marital_Status.
# stat_summary() computes the group means itself, so no pre-aggregation of
# `df` is needed.
plt <- ggplot(df, aes(x = Education, y = Income, fill = Marital_Status)) +
  stat_summary(fun = mean, geom = "bar", position = "dodge") +
  ggtitle("Mean Income Across Demographic Groups") +
  xlab("Education") +
  ylab("Mean Income") +
  labs(fill = "Marital Status") +
  theme_minimal()

# Render the plot
print(plt)

# Analyze "Total Spend" on diff "Education"
# Mean Total_Spent per Education level, one dodged bar per Marital_Status.
# stat_summary() computes the group means, so `df` is passed unaggregated.
# The title names total spending; it previously repeated the "Mean Income"
# title of the income plot although this plot shows Total_Spent.
plt <- ggplot(df, aes(x = Education, y = Total_Spent, fill = Marital_Status)) +
  stat_summary(fun = mean, geom = "bar", position = "dodge") +
  labs(title = "Mean Total Spending Across Demographic Groups",
       x = "Education",
       y = "Mean Total_Spent",
       fill = "Marital Status") +
  theme_minimal()

# Show the plot
print(plt)

# Purchasing Behavior Analysis - From the pie chart, we can deduce that the largest share of spending goes to alcoholic beverages (wine). - Furthermore, the second and third charts show that the largest consumer groups for alcoholic beverages are customers with a marital status of “Married” or “Single” and customers with a “University” (Graduation) educational background.

# Columns holding the amount spent per product category
product_categories <- c(
  "MntWines", "MntFruits", "MntMeatProducts",
  "MntFishProducts", "MntSweetProducts", "MntGoldProds"
)

# Share of each category in average customer spending, in percent
category_means <- colMeans(df[, product_categories])
percentage_spending <- category_means / sum(category_means) * 100

# One row per category for plotting
pie_data <- data.frame(
  category = names(percentage_spending),
  percentage = percentage_spending
)

# A pie chart is a single stacked bar drawn in polar coordinates
pie_chart <- ggplot(pie_data, aes(x = "", y = percentage, fill = category)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y") +
  ggtitle("Percentage of Spending on Different Product Categories") +
  xlab(NULL) +
  ylab(NULL) +
  theme_minimal() +
  theme(legend.position = "bottom")

# Render the pie chart
print(pie_chart)

# Bar plot of how total spending splits across product categories and the
# levels of one grouping column.
#
# Each bar is the spending of one group on one product category, expressed as
# a percentage of the spending of all customers on all categories (so all
# bars of a plot add up to 100).
#
# data:        data frame holding the spending columns and the grouping column
# group_col:   name of the grouping column, as a string
# group_label: human-readable group name used in the title and legend
# categories:  names of the spending columns
# Returns the ggplot object (not yet printed).
plot_spending_share <- function(data, group_col, group_label, categories) {
  # Grand total computed once instead of once per group and column
  total_spending <- sum(data[, categories])

  share_wide <- aggregate(
    data[, categories],
    by = list(Group = data[[group_col]]),
    FUN = function(x) sum(x) / total_spending * 100
  )

  # One row per (group, product category); pivot_longer() replaces the
  # superseded gather()
  share_long <- tidyr::pivot_longer(
    share_wide,
    cols = -Group,
    names_to = "Product_Category",
    values_to = "Percentage"
  )

  ggplot(
    share_long,
    aes(x = Product_Category, y = Percentage, fill = factor(Group))
  ) +
    geom_col(position = "dodge") +
    labs(
      title = paste(
        "Percentage of Spending on Different Product Categories by",
        group_label
      ),
      x = "Product Category",
      y = "Percentage",
      fill = group_label
    ) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
}

# Percentage of spending for each product category by Marital_Status
bar_plot <- plot_spending_share(
  df, "Marital_Status", "Marital Status", product_categories
)
print(bar_plot)

# Percentage of spending for each product category by "Education"
bar_plot <- plot_spending_share(df, "Education", "Education", product_categories)
print(bar_plot)

# Box plot of Total_Spent for each number of children; Children is converted
# to a factor so every distinct count gets its own box.
box_plot <- ggplot(df, aes(x = as.factor(Children), y = Total_Spent)) +
  geom_boxplot(fill = "skyblue", color = "black", alpha = 0.7) +
  ggtitle("Distribution of Total Spending by Number of Children/Teenagers") +
  xlab("Number of Children/Teenagers") +
  ylab("Total Spending") +
  theme_minimal()

# Render the plot
print(box_plot)

Web and Store Purchases Analysis

# Create box plots for each purchase channel
purchase_channels <- c('NumWebPurchases', 'NumCatalogPurchases', 'NumStorePurchases')

# Build the box plot for one purchase channel.
#
# aes() captures its expressions unevaluated and only resolves them when the
# plot is drawn. Building each plot inside a function gives every plot its own
# `channel` binding, so a stored plot keeps showing the right column no matter
# what a global loop variable holds when it is finally printed. The column is
# looked up through the `.data` pronoun rather than `df[[channel]]`, which
# ggplot2 discourages inside aes().
#
# channel: name of a purchase-count column of `df`, as a string
# Returns the ggplot object (not yet printed).
make_channel_box_plot <- function(channel) {
  force(channel)
  ggplot(df, aes(x = channel, y = .data[[channel]])) +
    geom_boxplot(fill = "skyblue", color = "black", alpha = 0.7) +
    labs(title = paste("Distribution of Purchases through", channel),
         x = "Purchase Channel",
         y = "Number of Purchases") +
    theme_minimal()
}

# One plot per channel, stored under the channel name
box_plots <- lapply(purchase_channels, make_channel_box_plot)
names(box_plots) <- purchase_channels

# Print the box plots
for (channel in purchase_channels) {
  print(box_plots[[channel]])
}

# Box plot of each purchase channel's purchase counts, split by Marital_Status.
# The channel column is looked up through the `.data` pronoun; referring to
# `df[[channel]]` inside aes() is discouraged by ggplot2 and raised one
# warning per plot.
for (channel in purchase_channels) {
  box_plot <- ggplot(df, aes(x = Marital_Status, y = .data[[channel]])) +
    geom_boxplot(fill = "skyblue", color = "black", alpha = 0.7) +
    labs(title = paste("Distribution of", channel, "by Marital_Status"),
         x = "Marital_Status",
         y = paste("Number of", channel)) +
    theme_minimal()

  # Show the box plot for each channel
  print(box_plot)
}

Discount Analysis

# Distribution of discounted (deal) purchases for each marital status
box_plot <- ggplot(df, aes(x = Marital_Status, y = NumDealsPurchases)) +
  geom_boxplot(fill = "skyblue", color = "black", alpha = 0.7) +
  ggtitle("Distribution of Deals Purchases by Marital_Status") +
  xlab("Marital_Status") +
  ylab("Number of Deals Purchases") +
  theme_minimal()

# Render the plot
print(box_plot)

# Hypothesis Testing Income Affects Spending

The results show a statistically significant positive association between consumers' income level and their expenditure on alcoholic beverages and meat products. The one exception is the 120k+ income group, whose mean wine spending (26.5) is far below that of the 60k-90k and 90k-120k groups.

# Hypothesis: Income Affects Spending

# Income bands: boundaries are right-closed, and include.lowest = TRUE also
# puts an income of exactly 0 into the first band.
income_breaks <- c(0, 30000, 60000, 90000, 120000, Inf)
income_labels <- c("0-30k", "30k-60k", "60k-90k", "90k-120k", "120k+")

# Attach the income band of every customer as the factor 'IncomeGroup'
customer_data <- mutate(
  df,
  IncomeGroup = cut(
    Income,
    breaks = income_breaks,
    labels = income_labels,
    include.lowest = TRUE
  )
)

# Mean amount spent per product category within each income band
spending_by_income <- group_by(customer_data, IncomeGroup)
summary_table <- summarise(
  spending_by_income,
  Mean_Wines = mean(MntWines),
  Mean_Fruits = mean(MntFruits),
  Mean_MeatProducts = mean(MntMeatProducts),
  Mean_FishProducts = mean(MntFishProducts),
  Mean_SweetProducts = mean(MntSweetProducts),
  Mean_GoldProds = mean(MntGoldProds)
)

# Show the table
print(summary_table)
## # A tibble: 5 × 7
##   IncomeGroup Mean_Wines Mean_Fruits Mean_MeatProducts Mean_FishProducts
##   <fct>            <dbl>       <dbl>             <dbl>             <dbl>
## 1 0-30k             13.8        5.75              21.5              8.01
## 2 30k-60k          171.        10.7               61.6             16.2 
## 3 60k-90k          587.        54.3              344.              76.0 
## 4 90k-120k         820.        66.8              599.             101.  
## 5 120k+             26.5        4.5              622.               4.25
## # ℹ 2 more variables: Mean_SweetProducts <dbl>, Mean_GoldProds <dbl>
# One-way ANOVA: does mean wine spending (MntWines) differ between the
# IncomeGroup levels? The fitted model is kept because TukeyHSD() is applied
# to it further down.
anova_result_wines <- aov(MntWines ~ IncomeGroup, data = customer_data)
# Print the ANOVA table (degrees of freedom, sums of squares, F value, p-value)
print(summary(anova_result_wines))
##               Df    Sum Sq  Mean Sq F value Pr(>F)    
## IncomeGroup    4 124853862 31213465   541.5 <2e-16 ***
## Residuals   2235 128819923    57638                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Same one-way ANOVA for fruit spending (MntFruits) across the IncomeGroup
# levels. The fitted model is kept because TukeyHSD() is applied to it further
# down.
anova_result_fruits <- aov(MntFruits ~ IncomeGroup, data = customer_data)
# Print the ANOVA table (degrees of freedom, sums of squares, F value, p-value)
print(summary(anova_result_fruits))
##               Df  Sum Sq Mean Sq F value Pr(>F)    
## IncomeGroup    4 1101648  275412   252.2 <2e-16 ***
## Residuals   2235 2440285    1092                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Repeat for other spending variables

# Post-hoc Tukey HSD test on the wine ANOVA: compares every pair of income
# groups and reports the mean difference, its 95% family-wise confidence
# interval and an adjusted p-value, showing which groups differ significantly.
posthoc_wines <- TukeyHSD(anova_result_wines)
print(posthoc_wines)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = MntWines ~ IncomeGroup, data = customer_data)
## 
## $IncomeGroup
##                        diff        lwr        upr     p adj
## 30k-60k-0-30k     157.54739   117.8176  197.27719 0.0000000
## 60k-90k-0-30k     572.87628   531.5709  614.18163 0.0000000
## 90k-120k-0-30k    806.70781   703.2334  910.18221 0.0000000
## 120k+-0-30k        12.71892  -221.4968  246.93465 0.9998916
## 60k-90k-30k-60k   415.32889   384.3031  446.35466 0.0000000
## 90k-120k-30k-60k  649.16041   549.3435  748.97734 0.0000000
## 120k+-30k-60k    -144.82847  -377.4515   87.79455 0.4343004
## 90k-120k-60k-90k  233.83153   133.3771  334.28597 0.0000000
## 120k+-60k-90k    -560.15736  -793.0547 -327.26007 0.0000000
## 120k+-90k-120k   -793.98889 -1045.4684 -542.50933 0.0000000
# Post-hoc Tukey HSD test on the fruit ANOVA: the same pairwise comparison of
# income groups (mean difference, 95% family-wise confidence interval,
# adjusted p-value), here for fruit spending.
posthoc_fruits <- TukeyHSD(anova_result_fruits)
print(posthoc_fruits)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = MntFruits ~ IncomeGroup, data = customer_data)
## 
## $IncomeGroup
##                        diff        lwr       upr     p adj
## 30k-60k-0-30k      4.928495  -0.539713  10.39670 0.1001278
## 60k-90k-0-30k     48.524359  42.839299  54.20942 0.0000000
## 90k-120k-0-30k    61.054054  46.812359  75.29575 0.0000000
## 120k+-0-30k       -1.245946 -33.482216  30.99032 0.9999720
## 60k-90k-30k-60k   43.595863  39.325632  47.86609 0.0000000
## 90k-120k-30k-60k  56.125559  42.387261  69.86386 0.0000000
## 120k+-30k-60k     -6.174441 -38.191500  25.84262 0.9846753
## 90k-120k-60k-90k  12.529695  -1.296347  26.35574 0.0969113
## 120k+-60k-90k    -49.770305 -81.825112 -17.71550 0.0002265
## 120k+-90k-120k   -62.300000 -96.912377 -27.68762 0.0000095

Education Influences Total Spending - Null Hypothesis (H0): Education level does not impact total spending. - Alternative Hypothesis (H1): Customers with higher education levels have a higher total spending.

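The results indicate that total spending differs significantly between education levels (ANOVA, p < 0.001). The post-hoc comparison shows that the difference is driven mainly by the “Basic” group, which spends significantly less than every other education level.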

# Hypothesis: Education Influences Total Spending

# Create a new variable 'EducationGroup' holding the education level as a
# factor, which is the form aov() and TukeyHSD() need for a grouping variable
customer_data <- customer_data %>%
  mutate(EducationGroup = as.factor(Education))

# Check unique values in the 'Education' column and recode if necessary
unique(customer_data$Education)
## [1] "Graduation" "PhD"        "Master"     "Basic"      "2n Cycle"
# Recode if needed (replace 'Basic' with a specific level)
# customer_data$Education <- recode(customer_data$Education, 'Basic' = 'Basic_Level')

# Summary statistics of total spending by education group.
# Every summarise() expression reduces its group to a single value, so the
# result has exactly one row per education level. (Summing the spending
# columns without an aggregate returned one row per customer and triggered
# dplyr's "more than 1 row per group" deprecation warning.)
summary_table_education <- customer_data %>%
  group_by(EducationGroup) %>%
  summarise(
    Customers = n(),
    Mean_Total_Spending = mean(Total_Spent),
    Median_Total_Spending = median(Total_Spent)
  )

# Print summary table
print(summary_table_education)
# One-way ANOVA: does mean total spending (Total_Spent) differ between the
# EducationGroup levels? The fitted model is kept because TukeyHSD() is
# applied to it below.
anova_result_education <- aov(Total_Spent ~ EducationGroup, data = customer_data)
# Print the ANOVA table (degrees of freedom, sums of squares, F value, p-value)
print(summary(anova_result_education))
##                  Df    Sum Sq Mean Sq F value   Pr(>F)    
## EducationGroup    4  19644802 4911201   13.85 3.66e-11 ***
## Residuals      2235 792449913  354564                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Post-hoc Tukey HSD test on the education ANOVA: compares every pair of
# education groups (mean difference, 95% family-wise confidence interval,
# adjusted p-value) to identify which groups differ significantly.
posthoc_education <- TukeyHSD(anova_result_education)
print(posthoc_education)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Total_Spent ~ EducationGroup, data = customer_data)
## 
## $EducationGroup
##                            diff          lwr        upr     p adj
## Basic-2n Cycle      -414.730797 -663.6349059 -165.82669 0.0000557
## Graduation-2n Cycle  123.371753   -0.5724578  247.31596 0.0517520
## Master-2n Cycle      115.253987  -26.7298247  257.23780 0.1740320
## PhD-2n Cycle         175.882371   40.0341803  311.73056 0.0038144
## Graduation-Basic     538.102550  311.6503663  764.55473 0.0000000
## Master-Basic         529.984785  293.1772106  766.79236 0.0000000
## PhD-Basic            590.613169  357.4326479  823.79369 0.0000000
## Master-Graduation     -8.117765 -105.5176591   89.28213 0.9994067
## PhD-Graduation        52.510619  -35.7054083  140.72665 0.4814023
## PhD-Master            60.628384  -51.5291078  172.78588 0.5784234

Parenthood Affects Total Spending - Null Hypothesis (H0): There is no significant difference in total spending between customers with and without children. - Alternative Hypothesis (H1): Customers with children have different total spending habits compared to customers without children.

From the previous box plot, we observed a negative correlation between the number of children and expenditure. We can further validate this using a t-test. The results indicate that consumers with children have significantly lower total expenditure compared to those without children.

# Hypothesis: Parenthood Affects Total Spending

# Create a new variable 'HasChildren' that labels each customer as having
# children or not. Note that this rebuilds customer_data from df.
customer_data <- df %>%
  mutate(HasChildren = ifelse(Children > 0, "With Children", "Without Children"))

# Summary statistics of total spending by parenthood status.
# Every summarise() expression reduces its group to a single value, so the
# result has exactly one row per group. (Summing the spending columns without
# an aggregate returned one row per customer and triggered dplyr's "more than
# 1 row per group" deprecation warning.)
summary_table_parenthood <- customer_data %>%
  group_by(HasChildren) %>%
  summarise(
    Customers = n(),
    Mean_Total_Spending = mean(Total_Spent),
    Median_Total_Spending = median(Total_Spent)
  )

# Perform t-test for total spending between customers with and without
# children. The groups are the values of Parental_Status; t.test() runs the
# Welch variant by default, which does not assume equal variances.
t_test_result <- t.test(Total_Spent ~ Parental_Status, data = customer_data)
print(t_test_result)
## 
##  Welch Two Sample t-test
## 
## data:  Total_Spent by Parental_Status
## t = 25.071, df = 894.24, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  644.6949 754.2049
## sample estimates:
## mean in group 0 mean in group 1 
##       1106.0298        406.5799

Age and Spending Habits - Null Hypothesis (H0): There is no correlation between age and total spending. - Alternative Hypothesis (H1): Younger customers exhibit different spending patterns compared to older customers.

The t-value is 5.2986, and the corresponding p-value is 1.282e-07. This extremely small p-value means we can reject the null hypothesis that the true correlation is 0, so the correlation between age and total spending is statistically significant. The correlation itself is weak, however (r ≈ 0.11), so age explains only a small part of the variation in total spending.

# Hypothesis: Age and Spending Habits
# Pearson correlation test between Age and Total_Spent. The result holds the
# correlation estimate, its confidence interval and the p-value for the null
# hypothesis that the true correlation is 0.
correlation_result <- cor.test(df$Age, 
                               df$Total_Spent, 
                               method = "pearson")

# Print correlation result
print(correlation_result)
## 
##  Pearson's product-moment correlation
## 
## data:  df$Age and df$Total_Spent
## t = 5.2986, df = 2238, p-value = 1.282e-07
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.07021422 0.15202145
## sample estimates:
##       cor 
## 0.1113064

Web Visits Influence Web Purchases - Null Hypothesis (H0): The number of web visits does not affect the number of web purchases. - Alternative Hypothesis (H1): Customers who visit the website more frequently are more likely to make web purchases.

From this test, we can observe a statistically significant but very weak negative correlation (r ≈ -0.06) between the number of website visits and the number of website purchases.

One possible explanation is that consumers who spend more time browsing online may engage in window shopping or informational searches without necessarily intending to make immediate purchases. On the other hand, those who spend less time browsing might have clearer preferences or specific purchase intentions, leading to more direct and purposeful buying behavior.

# Hypothesis: Web Visits Influence Web Purchases
# Pearson correlation test between NumWebVisitsMonth and NumWebPurchases. The
# result holds the correlation estimate, its confidence interval and the
# p-value for the null hypothesis that the true correlation is 0.
correlation_result <- cor.test(df$NumWebVisitsMonth, 
                               df$NumWebPurchases, 
                               method = "pearson")

# Print correlation result
print(correlation_result)
## 
##  Pearson's product-moment correlation
## 
## data:  df$NumWebVisitsMonth and df$NumWebPurchases
## t = -2.6461, df = 2238, p-value = 0.0082
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.09703774 -0.01446393
## sample estimates:
##         cor 
## -0.05584633
# Simple linear regression of NumWebPurchases on NumWebVisitsMonth; the slope
# estimates how the number of web purchases changes per additional web visit.
regression_model <- lm(NumWebPurchases ~ NumWebVisitsMonth, data = customer_data)

# Print regression summary (coefficients, standard errors, R-squared, F test)
summary(regression_model)
## 
## Call:
## lm(formula = NumWebPurchases ~ NumWebVisitsMonth, data = customer_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.4248 -2.0411 -0.3609  1.7670 22.6391 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        4.42481    0.14123  31.330   <2e-16 ***
## NumWebVisitsMonth -0.06395    0.02417  -2.646   0.0082 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.775 on 2238 degrees of freedom
## Multiple R-squared:  0.003119,   Adjusted R-squared:  0.002673 
## F-statistic: 7.002 on 1 and 2238 DF,  p-value: 0.0082